{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from Bio.SeqUtils.ProtParam import ProteinAnalysis\n",
    "from Bio import SeqIO\n",
    "import pandas as pd\n",
    "from pprint import pprint\n",
    "import re\n",
    "import ujson as json\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# FASTA file with the protein sequences that get analysed.\n",
    "fastaFile = \"../Sequences/Protein/sequences.fasta\"\n",
    "# Folder the result table is written to.\n",
    "workdir = \"../assets/ProtParam/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parses a FASTA description of the form '<accession> |<protein> ...':\n",
    "#   group 1 = accession, group 2 = first token after the pipe.\n",
    "# Raw string avoids invalid-escape warnings; 'A-Za-z' replaces 'A-z', which\n",
    "# also matched the punctuation between 'Z' and 'a' ([, \\, ], ^, `).\n",
    "pat = re.compile(r\"([A-Za-z0-9_]+)\\s+\\|([A-Za-z0-9_'-]+)[\\s|]+\")\n",
    "\n",
    "# Names of the ProteinAnalysis members collected for every sequence.\n",
    "attr_lyst = (\n",
    "    'aromaticity',\n",
    "    'count_amino_acids',\n",
    "    'flexibility',\n",
    "    'get_amino_acids_percent',\n",
    "    'gravy',\n",
    "    'instability_index',\n",
    "    'isoelectric_point',\n",
    "    'length',\n",
    "    'molar_extinction_coefficient',\n",
    "    'molecular_weight',\n",
    "    'monoisotopic',\n",
    "    'secondary_structure_fraction',\n",
    ")\n",
    "\n",
    "def protParam(seq: str, description: str):\n",
    "    \"\"\"Yield ``(name, value)`` pairs describing one protein record.\n",
    "\n",
    "    The accession and protein name are parsed from ``description`` with the\n",
    "    module-level ``pat``. Then, for each name in ``attr_lyst``, the member of\n",
    "    ``ProteinAnalysis(seq)`` with that name is fetched: callables are invoked\n",
    "    without arguments, anything else is yielded as-is. The ``'accession'`` and\n",
    "    ``'protein'`` pairs are yielded last.\n",
    "\n",
    "    Args:\n",
    "        seq: Sequence string passed to ``ProteinAnalysis``.\n",
    "        description: Header text that ``pat`` is searched in.\n",
    "\n",
    "    Yields:\n",
    "        ``(name, value)`` tuples, suitable for ``dict(...)``.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``pat`` does not match ``description``.\n",
    "    \"\"\"\n",
    "    # Parse the header first: a non-matching description used to surface as an\n",
    "    # opaque AttributeError on None only after all analyses had been run.\n",
    "    match = pat.search(description)\n",
    "    if match is None:\n",
    "        raise ValueError(\n",
    "            'Cannot parse accession/protein from description: {!r}'.format(description))\n",
    "    accession, protein = match.groups()\n",
    "\n",
    "    analysed_seq = ProteinAnalysis(seq)\n",
    "    for attr in attr_lyst:\n",
    "        member = getattr(analysed_seq, attr)\n",
    "        yield attr, (member() if callable(member) else member)\n",
    "    yield 'accession', accession\n",
    "    yield 'protein', protein"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>aromaticity</th>\n",
       "      <th>count_amino_acids</th>\n",
       "      <th>flexibility</th>\n",
       "      <th>get_amino_acids_percent</th>\n",
       "      <th>gravy</th>\n",
       "      <th>instability_index</th>\n",
       "      <th>isoelectric_point</th>\n",
       "      <th>length</th>\n",
       "      <th>molar_extinction_coefficient</th>\n",
       "      <th>molecular_weight</th>\n",
       "      <th>monoisotopic</th>\n",
       "      <th>secondary_structure_fraction</th>\n",
       "      <th>accession</th>\n",
       "      <th>protein</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.101930</td>\n",
       "      <td>{'A': 309, 'C': 138, 'D': 211, 'E': 239, 'F': ...</td>\n",
       "      <td>[1.0167738095238095, 0.9989523809523808, 0.985...</td>\n",
       "      <td>{'A': 0.07014755959137343, 'C': 0.031328036322...</td>\n",
       "      <td>-0.023314</td>\n",
       "      <td>34.924048</td>\n",
       "      <td>6.035828</td>\n",
       "      <td>4405</td>\n",
       "      <td>(543550, 552175)</td>\n",
       "      <td>489983.4465</td>\n",
       "      <td>False</td>\n",
       "      <td>(0.33371169125993183, 0.21679909194097619, 0.2...</td>\n",
       "      <td>YP_009725295</td>\n",
       "      <td>orf1a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.186047</td>\n",
       "      <td>{'A': 2, 'C': 2, 'D': 2, 'E': 3, 'F': 6, 'G': ...</td>\n",
       "      <td>[0.9632500000000002, 0.9761666666666665, 0.998...</td>\n",
       "      <td>{'A': 0.046511627906976744, 'C': 0.04651162790...</td>\n",
       "      <td>1.448837</td>\n",
       "      <td>50.963023</td>\n",
       "      <td>4.167297</td>\n",
       "      <td>43</td>\n",
       "      <td>(6990, 7115)</td>\n",
       "      <td>5180.2131</td>\n",
       "      <td>False</td>\n",
       "      <td>(0.5813953488372092, 0.06976744186046512, 0.41...</td>\n",
       "      <td>YP_009725296</td>\n",
       "      <td>ORF7b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.061111</td>\n",
       "      <td>{'A': 7, 'C': 1, 'D': 9, 'E': 18, 'F': 5, 'G':...</td>\n",
       "      <td>[1.0167738095238095, 0.9989523809523808, 0.985...</td>\n",
       "      <td>{'A': 0.03888888888888889, 'C': 0.005555555555...</td>\n",
       "      <td>-0.378333</td>\n",
       "      <td>28.832222</td>\n",
       "      <td>5.363953</td>\n",
       "      <td>180</td>\n",
       "      <td>(12950, 12950)</td>\n",
       "      <td>19775.0617</td>\n",
       "      <td>False</td>\n",
       "      <td>(0.3111111111111111, 0.25555555555555554, 0.27...</td>\n",
       "      <td>YP_009725297</td>\n",
       "      <td>leader</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.024096</td>\n",
       "      <td>{'A': 5, 'C': 3, 'D': 5, 'E': 5, 'F': 1, 'G': ...</td>\n",
       "      <td>[0.9924404761904762, 1.0428214285714286, 0.966...</td>\n",
       "      <td>{'A': 0.060240963855421686, 'C': 0.03614457831...</td>\n",
       "      <td>0.198795</td>\n",
       "      <td>51.968675</td>\n",
       "      <td>5.179260</td>\n",
       "      <td>83</td>\n",
       "      <td>(5500, 5625)</td>\n",
       "      <td>9239.7338</td>\n",
       "      <td>False</td>\n",
       "      <td>(0.3373493975903615, 0.1686746987951807, 0.349...</td>\n",
       "      <td>YP_009725303</td>\n",
       "      <td>nsp7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.101156</td>\n",
       "      <td>{'A': 16, 'C': 5, 'D': 23, 'E': 23, 'F': 21, '...</td>\n",
       "      <td>[1.0004642857142858, 0.9671071428571429, 1.002...</td>\n",
       "      <td>{'A': 0.046242774566473986, 'C': 0.01445086705...</td>\n",
       "      <td>-0.075723</td>\n",
       "      <td>36.282399</td>\n",
       "      <td>5.056580</td>\n",
       "      <td>346</td>\n",
       "      <td>(32890, 33140)</td>\n",
       "      <td>38812.9124</td>\n",
       "      <td>False</td>\n",
       "      <td>(0.36127167630057805, 0.21098265895953758, 0.2...</td>\n",
       "      <td>YP_009725310</td>\n",
       "      <td>endoRNAse</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1044</th>\n",
       "      <td>0.132231</td>\n",
       "      <td>{'A': 5, 'C': 7, 'D': 7, 'E': 6, 'F': 8, 'G': ...</td>\n",
       "      <td>[0.9480595238095239, 0.9510476190476191, 0.964...</td>\n",
       "      <td>{'A': 0.04132231404958678, 'C': 0.057851239669...</td>\n",
       "      <td>0.219008</td>\n",
       "      <td>45.791074</td>\n",
       "      <td>5.416809</td>\n",
       "      <td>121</td>\n",
       "      <td>(15930, 16305)</td>\n",
       "      <td>13830.8536</td>\n",
       "      <td>False</td>\n",
       "      <td>(0.396694214876033, 0.19008264462809918, 0.181...</td>\n",
       "      <td>QHD43422</td>\n",
       "      <td>ORF8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1045</th>\n",
       "      <td>0.098361</td>\n",
       "      <td>{'A': 1, 'C': 0, 'D': 4, 'E': 5, 'F': 3, 'G': ...</td>\n",
       "      <td>[0.9877738095238093, 0.9425357142857143, 0.995...</td>\n",
       "      <td>{'A': 0.01639344262295082, 'C': 0.0, 'D': 0.06...</td>\n",
       "      <td>0.232787</td>\n",
       "      <td>31.162295</td>\n",
       "      <td>4.677063</td>\n",
       "      <td>61</td>\n",
       "      <td>(8480, 8480)</td>\n",
       "      <td>7272.4566</td>\n",
       "      <td>False</td>\n",
       "      <td>(0.4426229508196722, 0.14754098360655737, 0.27...</td>\n",
       "      <td>QHD43420</td>\n",
       "      <td>ORF6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1046</th>\n",
       "      <td>0.069212</td>\n",
       "      <td>{'A': 37, 'C': 0, 'D': 24, 'E': 12, 'F': 13, '...</td>\n",
       "      <td>[1.044154761904762, 1.0403214285714284, 1.0424...</td>\n",
       "      <td>{'A': 0.0883054892601432, 'C': 0.0, 'D': 0.057...</td>\n",
       "      <td>-0.971360</td>\n",
       "      <td>55.083103</td>\n",
       "      <td>10.069031</td>\n",
       "      <td>419</td>\n",
       "      <td>(43890, 43890)</td>\n",
       "      <td>45625.1383</td>\n",
       "      <td>False</td>\n",
       "      <td>(0.18615751789976134, 0.31026252983293556, 0.1...</td>\n",
       "      <td>QHD43423</td>\n",
       "      <td>nucleocapsid</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1047</th>\n",
       "      <td>0.123967</td>\n",
       "      <td>{'A': 9, 'C': 6, 'D': 2, 'E': 8, 'F': 10, 'G':...</td>\n",
       "      <td>[0.9447142857142858, 0.9467857142857143, 0.950...</td>\n",
       "      <td>{'A': 0.0743801652892562, 'C': 0.0495867768595...</td>\n",
       "      <td>0.318182</td>\n",
       "      <td>48.655372</td>\n",
       "      <td>8.230774</td>\n",
       "      <td>121</td>\n",
       "      <td>(7450, 7825)</td>\n",
       "      <td>13744.0136</td>\n",
       "      <td>False</td>\n",
       "      <td>(0.3801652892561984, 0.15702479338842976, 0.27...</td>\n",
       "      <td>QHD43421</td>\n",
       "      <td>ORF7a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048</th>\n",
       "      <td>0.120000</td>\n",
       "      <td>{'A': 4, 'C': 3, 'D': 1, 'E': 2, 'F': 5, 'G': ...</td>\n",
       "      <td>[1.0188809523809523, 1.0245238095238096, 1.028...</td>\n",
       "      <td>{'A': 0.05333333333333334, 'C': 0.04, 'D': 0.0...</td>\n",
       "      <td>1.128000</td>\n",
       "      <td>38.676133</td>\n",
       "      <td>8.568787</td>\n",
       "      <td>75</td>\n",
       "      <td>(5960, 6085)</td>\n",
       "      <td>8364.9414</td>\n",
       "      <td>False</td>\n",
       "      <td>(0.52, 0.21333333333333335, 0.28)</td>\n",
       "      <td>QHD43418</td>\n",
       "      <td>envelope</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1049 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      aromaticity                                  count_amino_acids  \\\n",
       "0        0.101930  {'A': 309, 'C': 138, 'D': 211, 'E': 239, 'F': ...   \n",
       "1        0.186047  {'A': 2, 'C': 2, 'D': 2, 'E': 3, 'F': 6, 'G': ...   \n",
       "2        0.061111  {'A': 7, 'C': 1, 'D': 9, 'E': 18, 'F': 5, 'G':...   \n",
       "3        0.024096  {'A': 5, 'C': 3, 'D': 5, 'E': 5, 'F': 1, 'G': ...   \n",
       "4        0.101156  {'A': 16, 'C': 5, 'D': 23, 'E': 23, 'F': 21, '...   \n",
       "...           ...                                                ...   \n",
       "1044     0.132231  {'A': 5, 'C': 7, 'D': 7, 'E': 6, 'F': 8, 'G': ...   \n",
       "1045     0.098361  {'A': 1, 'C': 0, 'D': 4, 'E': 5, 'F': 3, 'G': ...   \n",
       "1046     0.069212  {'A': 37, 'C': 0, 'D': 24, 'E': 12, 'F': 13, '...   \n",
       "1047     0.123967  {'A': 9, 'C': 6, 'D': 2, 'E': 8, 'F': 10, 'G':...   \n",
       "1048     0.120000  {'A': 4, 'C': 3, 'D': 1, 'E': 2, 'F': 5, 'G': ...   \n",
       "\n",
       "                                            flexibility  \\\n",
       "0     [1.0167738095238095, 0.9989523809523808, 0.985...   \n",
       "1     [0.9632500000000002, 0.9761666666666665, 0.998...   \n",
       "2     [1.0167738095238095, 0.9989523809523808, 0.985...   \n",
       "3     [0.9924404761904762, 1.0428214285714286, 0.966...   \n",
       "4     [1.0004642857142858, 0.9671071428571429, 1.002...   \n",
       "...                                                 ...   \n",
       "1044  [0.9480595238095239, 0.9510476190476191, 0.964...   \n",
       "1045  [0.9877738095238093, 0.9425357142857143, 0.995...   \n",
       "1046  [1.044154761904762, 1.0403214285714284, 1.0424...   \n",
       "1047  [0.9447142857142858, 0.9467857142857143, 0.950...   \n",
       "1048  [1.0188809523809523, 1.0245238095238096, 1.028...   \n",
       "\n",
       "                                get_amino_acids_percent     gravy  \\\n",
       "0     {'A': 0.07014755959137343, 'C': 0.031328036322... -0.023314   \n",
       "1     {'A': 0.046511627906976744, 'C': 0.04651162790...  1.448837   \n",
       "2     {'A': 0.03888888888888889, 'C': 0.005555555555... -0.378333   \n",
       "3     {'A': 0.060240963855421686, 'C': 0.03614457831...  0.198795   \n",
       "4     {'A': 0.046242774566473986, 'C': 0.01445086705... -0.075723   \n",
       "...                                                 ...       ...   \n",
       "1044  {'A': 0.04132231404958678, 'C': 0.057851239669...  0.219008   \n",
       "1045  {'A': 0.01639344262295082, 'C': 0.0, 'D': 0.06...  0.232787   \n",
       "1046  {'A': 0.0883054892601432, 'C': 0.0, 'D': 0.057... -0.971360   \n",
       "1047  {'A': 0.0743801652892562, 'C': 0.0495867768595...  0.318182   \n",
       "1048  {'A': 0.05333333333333334, 'C': 0.04, 'D': 0.0...  1.128000   \n",
       "\n",
       "      instability_index  isoelectric_point  length  \\\n",
       "0             34.924048           6.035828    4405   \n",
       "1             50.963023           4.167297      43   \n",
       "2             28.832222           5.363953     180   \n",
       "3             51.968675           5.179260      83   \n",
       "4             36.282399           5.056580     346   \n",
       "...                 ...                ...     ...   \n",
       "1044          45.791074           5.416809     121   \n",
       "1045          31.162295           4.677063      61   \n",
       "1046          55.083103          10.069031     419   \n",
       "1047          48.655372           8.230774     121   \n",
       "1048          38.676133           8.568787      75   \n",
       "\n",
       "     molar_extinction_coefficient  molecular_weight  monoisotopic  \\\n",
       "0                (543550, 552175)       489983.4465         False   \n",
       "1                    (6990, 7115)         5180.2131         False   \n",
       "2                  (12950, 12950)        19775.0617         False   \n",
       "3                    (5500, 5625)         9239.7338         False   \n",
       "4                  (32890, 33140)        38812.9124         False   \n",
       "...                           ...               ...           ...   \n",
       "1044               (15930, 16305)        13830.8536         False   \n",
       "1045                 (8480, 8480)         7272.4566         False   \n",
       "1046               (43890, 43890)        45625.1383         False   \n",
       "1047                 (7450, 7825)        13744.0136         False   \n",
       "1048                 (5960, 6085)         8364.9414         False   \n",
       "\n",
       "                           secondary_structure_fraction     accession  \\\n",
       "0     (0.33371169125993183, 0.21679909194097619, 0.2...  YP_009725295   \n",
       "1     (0.5813953488372092, 0.06976744186046512, 0.41...  YP_009725296   \n",
       "2     (0.3111111111111111, 0.25555555555555554, 0.27...  YP_009725297   \n",
       "3     (0.3373493975903615, 0.1686746987951807, 0.349...  YP_009725303   \n",
       "4     (0.36127167630057805, 0.21098265895953758, 0.2...  YP_009725310   \n",
       "...                                                 ...           ...   \n",
       "1044  (0.396694214876033, 0.19008264462809918, 0.181...      QHD43422   \n",
       "1045  (0.4426229508196722, 0.14754098360655737, 0.27...      QHD43420   \n",
       "1046  (0.18615751789976134, 0.31026252983293556, 0.1...      QHD43423   \n",
       "1047  (0.3801652892561984, 0.15702479338842976, 0.27...      QHD43421   \n",
       "1048                  (0.52, 0.21333333333333335, 0.28)      QHD43418   \n",
       "\n",
       "           protein  \n",
       "0            orf1a  \n",
       "1            ORF7b  \n",
       "2           leader  \n",
       "3             nsp7  \n",
       "4        endoRNAse  \n",
       "...            ...  \n",
       "1044          ORF8  \n",
       "1045          ORF6  \n",
       "1046  nucleocapsid  \n",
       "1047         ORF7a  \n",
       "1048      envelope  \n",
       "\n",
       "[1049 rows x 14 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parse the FASTA file named by fastaFile.\n",
    "seqOb = SeqIO.parse(fastaFile, \"fasta\")\n",
    "\n",
    "# Build one row per record from the (name, value) pairs protParam yields.\n",
    "# NOTE(review): every 'X' in a sequence is replaced with 'A' before analysis --\n",
    "# presumably to avoid unknown residues; confirm this substitution is intended.\n",
    "dfrm = pd.DataFrame(\n",
    "    dict(protParam(str(rec.seq).replace('X', 'A'), rec.description))\n",
    "    for rec in seqOb\n",
    ")\n",
    "dfrm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get PDB-related records\n",
    "\n",
    "```py\n",
    "dfrm[dfrm.protein.str.lower().eq('chain')]\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns whose cells hold dicts/lists/tuples; they are JSON-encoded before writing.\n",
    "jsonFocusCols = (\n",
    "    'count_amino_acids',\n",
    "    'flexibility',\n",
    "    'get_amino_acids_percent',\n",
    "    'molar_extinction_coefficient',\n",
    "    'secondary_structure_fraction',\n",
    ")\n",
    "\n",
    "def saveResults(dfrm: pd.DataFrame, outputFolder: str, **kwargs) -> None:\n",
    "    \"\"\"Write ``dfrm`` to ``protein_seq_protparam_result.tsv`` in ``outputFolder``.\n",
    "\n",
    "    Every column named in ``jsonFocusCols`` is serialised with ``json.dumps``\n",
    "    first. The encoding is applied to a new frame, so the caller's frame is\n",
    "    left untouched and calling this function again does not double-encode.\n",
    "\n",
    "    Args:\n",
    "        dfrm: Frame containing all ``jsonFocusCols`` columns.\n",
    "        outputFolder: Directory that receives the file.\n",
    "        **kwargs: Forwarded unchanged to ``DataFrame.to_csv``.\n",
    "    \"\"\"\n",
    "    # assign() returns a new DataFrame; existing column order is preserved.\n",
    "    encoded = dfrm.assign(**{col: dfrm[col].apply(json.dumps) for col in jsonFocusCols})\n",
    "    encoded.to_csv(os.path.join(outputFolder, 'protein_seq_protparam_result.tsv'), **kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the table into workdir as a tab-separated file without the index column.\n",
    "saveResults(\n",
    "    dfrm,\n",
    "    workdir,\n",
    "    sep='\\t',\n",
    "    index=False,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "file_extension": ".py",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  },
  "mimetype": "text/x-python",
  "name": "python",
  "npconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": 3
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
